Riemannian metric from manifold learning
library(tidyverse)
library(dimRed)
# library(dtplyr)
library(reticulate)
library(viridis)
# library(ks)
library(hdrcde)
library(igraph)
# library(seurat)
library(matrixcalc)
library(akima)
Jmisc::sourceAll(here::here("R/sources"))## Loading...
## /Users/fche0019/git/kderm/R/sources/add_global_label.R
## /Users/fche0019/git/kderm/R/sources/annHLLE.R
## /Users/fche0019/git/kderm/R/sources/annIsomap.R
## /Users/fche0019/git/kderm/R/sources/annLaplacianEigenmaps.R
## /Users/fche0019/git/kderm/R/sources/annLLE.R
## /Users/fche0019/git/kderm/R/sources/cal_recall.R
## /Users/fche0019/git/kderm/R/sources/calc_ann_table.R
## /Users/fche0019/git/kderm/R/sources/dimRedResult-class.R
## /Users/fche0019/git/kderm/R/sources/dr_quality.R
## /Users/fche0019/git/kderm/R/sources/hdrscatterplot.R
## /Users/fche0019/git/kderm/R/sources/makeKNNgraph.R
## /Users/fche0019/git/kderm/R/sources/metricML.R
## /Users/fche0019/git/kderm/R/sources/mnist_anntable_plotting.R
## Done
Smart meter data for one household
# load(here::here("data/spdemand_3639id336tow.rda"))
# nid <- 1
# ntow <- 336
#
# train <- spdemand %>%
# lazy_dt() %>%
# filter(tow <= ntow,
# # id <= sort(unique(spdemand[,id]))[nid],
# id == 1003) %>%
# dplyr::select(-id, -tow) %>%
# data.table::as.data.table()
# Pre-computed training set (see commented pipeline above: one household, id 1003, tow <= 336)
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))
# Number of observations; surrounding parentheses print the value
(N <- nrow(train))## [1] 336
Metric learning
# Parameters fixed
x <- train
s <- 2   # embedding dimension
k <- 20  # number of nearest neighbours (used by KNN search)
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- 0.4 # neighbourhood radius for the radius search
# Radius searching with k-d trees
metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype)
summary(metric_isomap)## Length Class Mode
## embedding 672 -none- numeric
## rmetric 1344 -none- numeric
## weighted_graph 10 igraph list
## adj_matrix 112896 dgCMatrix S4
The function metricML() returns a list of:
- the embedding coordinates, `embedding`
- the embedding metric, `rmetric`, for each point \(p \in x\) as an array
- the weighted neighborhood graph as an igraph object, `weighted_graph`
- the sparse adjacency matrix from the graph, `adj_matrix`
Embedding plots
# Comparison of Isomap embedding plots: metricML (left) vs megaman (right)
par(mfrow = c(1, 2))
plot(metric_isomap$embedding, col = viridis::viridis(24)) # metricML
# plot(e@data@data, col = viridis::viridis(24)) # embedding from dimRed, same as metricML()
emb_isomap <-
feather::read_feather(here::here("data/embedding_isomap_1id336tow.feather"))
plot(emb_isomap$`0`, emb_isomap$`1`, col = viridis::viridis(24)) # embedding from megaman, radius = 0.4 (section: Riemannian metric)
# Use adjacency matrix as input for metricML and dimRed, then the embedding should be close, as well as the Riemannian metric
metric_isomap$adj_matrix[1:10, 1:10]# renormalized adjacency matrix, top-left 10 x 10 corner## 10 x 10 sparse Matrix of class "dgCMatrix"
##
## [1,] 1.0000000 0.25392915 0.1208264 . . . .
## [2,] 0.2539292 1.00000000 0.3342501 0.1775332 0.1076027 0.09006124 .
## [3,] 0.1208264 0.33425008 1.0000000 0.3861779 0.2245130 0.16488133 0.1203864
## [4,] . 0.17753315 0.3861779 1.0000000 0.4971605 0.30586322 0.2255448
## [5,] . 0.10760270 0.2245130 0.4971605 1.0000000 0.40429943 0.3473452
## [6,] . 0.09006124 0.1648813 0.3058632 0.4042994 1.00000000 0.4308813
## [7,] . . 0.1203864 0.2255448 0.3473452 0.43088129 1.0000000
## [8,] . . 0.1020702 0.1884481 0.2616027 0.31973425 0.4748913
## [9,] . 0.09017866 0.1700980 0.3170741 0.4422321 0.61829795 0.4195736
## [10,] . 0.09494747 0.1670566 0.3008233 0.4007255 0.32522372 0.2918485
##
## [1,] . . .
## [2,] . 0.09017866 0.09494747
## [3,] 0.1020702 0.17009799 0.16705665
## [4,] 0.1884481 0.31707414 0.30082327
## [5,] 0.2616027 0.44223208 0.40072551
## [6,] 0.3197343 0.61829795 0.32522372
## [7,] 0.4748913 0.41957363 0.29184850
## [8,] 1.0000000 0.29214049 0.22131497
## [9,] 0.2921405 1.00000000 0.38466228
## [10,] 0.2213150 0.38466228 1.00000000
# The neighborhood graph should be connected for the embedding to be meaningful
metric_isomap$weighted_graph %>% is.connected()## [1] TRUE
riem_isomap <- metric_isomap$rmetric
# Spot-check the Riemannian metric at the first point
riem_isomap[,,1] %>%
# isSymmetric()
matrixcalc::is.positive.definite() # TRUE (earlier runs returned FALSE; fixed)## [1] TRUE
The Riemannian metric from the metricML() function is now positive definite for all points.
Now compare it with the megaman output.
# Load megaman's dual-metric array H of shape (N, s, s) via reticulate/numpy
np <- import("numpy")
H_isomap <- np$load(here::here("data/hmatrix_isomap_1id336tow.npy"))
# Spot-check the first point's metric from the Python output
H_isomap[1,,, drop=TRUE] %>%
matrixcalc::is.positive.definite() # TRUE## [1] TRUE
# Check positive definiteness at every point for the R output
# (swap in the commented line to check the Python output instead)
pd <- vapply(seq_len(N), function(i) {
  matrixcalc::is.positive.definite(riem_isomap[,,i]) # R
  # matrixcalc::is.positive.definite(H_isomap[i,,, drop=TRUE]) # Python
}, logical(1))
sum(pd) # count of positive-definite points## [1] 336
# determinant(riem_isomap[,,i])
# mat <- riem_isomap[,,i]
# eigen(mat)
# Debug: Riemannian metric not positive definite (DONE)
Question: footnote on P15 of the paper, how to decide the constant c?
In the case of heat kernel (1), c = 1/4, which - crucially - is independent of the dimension of M.
# Function for graph Laplacian
# Input: W: N*N weight matrix for the neighborhood graph, radius: bandwidth parameter
# Output: L: N*N graph Laplacian matrix
# Renormalized graph Laplacian.
# W: N*N weight matrix for the neighborhood graph
# radius: bandwidth parameter
# lambda: renormalization exponent (lambda = 1 gives the geometry-recovering Laplacian)
# const: kernel constant c; c = 1/4 for the heat kernel (P15 footnote), kept as a
#        parameter so other kernels/weights can supply their own constant
# Returns L: N*N sparse graph Laplacian matrix
Laplacian <- function(W, radius, lambda = 1, const = 1 / 4){
  D <- Matrix::Diagonal(x = rowSums(W)^(-lambda)) # inverse of a diagonal matrix
  W1 <- D %*% W %*% D
  D1 <- Matrix::Diagonal(x = 1 / rowSums(W1)) # inverse of Tn1
  # 1 / (radius^2 * const) reproduces the original 1 / (radius^2 / 4) when const = 1/4
  L <- 1 / (radius^2 * const) * (D1 %*% W1 - Matrix::Diagonal(nrow(W)))
  return(L)
}
# Function for Riemannian metric for each point
# The Riemannian metric and its dual associated with an embedding Y.
# The intrinsic dimension d
# The Riemannian metric is currently denoted by G, its dual metric by H, and the Laplacian by L.
# G at each point is the matrix inverse of H.
# Riemannian (embedding) metric associated with an embedding Y.
# Y: N x d embedding coordinates; laplacian: N x N graph Laplacian; d: intrinsic dimension.
# Returns a (d, d, N) array H: the dual metric \tilde{H} is assembled first,
# then the embedding metric at each point is its (pseudo) inverse.
riemann_metric <- function(Y, laplacian, d){
  # TODO: add dimension check for all inputs
  H <- array(NA, dim = c(d, d, nrow(Y)))
  # Step 4(a): \tilde{H}[i,j] = 0.5 * (L(y_i y_j) - y_i * L y_j - y_j * L y_i)
  for (i in seq_len(d)) {
    for (j in i:d) {
      yij <- Y[, i] * Y[, j]
      H[i, j, ] <- as.vector(0.5 * (laplacian %*% yij - Y[, i] * (laplacian %*% Y[, j]) - Y[, j] * (laplacian %*% Y[, i])))
      H[j, i, ] <- H[i, j, ] # symmetric matrix
    }
  }
  ## Pseudo inverse of \tilde{H} gives the final embedding metric H
  for (i in seq_len(nrow(Y))) {
    Hsvals <- eigen(H[, , i], symmetric = TRUE) # \tilde{H} is symmetric by construction
    Huu <- Hsvals$vectors
    Hvv <- Hsvals$values[1:d] # top d largest eigenvalues, already sorted in decreasing order
    # nrow = d keeps diag() from building an identity matrix when d == 1
    # (diag(scalar) creates a scalar-sized identity, a dimension bug)
    Hvv1 <- diag(1 / Hvv, nrow = d)
    H[, , i] <- Huu %*% Hvv1 %*% t(Huu)
    H[, , i] <- 0.5 * (H[, , i] + t(H[, , i])) # fix not symmetric issue
  }
  return(H)
}
# Variable kernel density estimate
For multivariate data, the variable (balloon-type) kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_{i=1}^{n} K_{H(x)} (x - X_i),\] where the bandwidth matrix \(H(x)\) varies with the evaluation point \(x\).
# 2-d kde
# Variable-bandwidth ("balloon") Gaussian kernel density estimate.
# x: n x d data matrix; h: (d, d, n) array, where h[, , i] is the bandwidth
#    matrix used when evaluating the density at x[i, ].
# Returns a length-n vector of density estimates at the data points.
mkde <- function (x, h) {
  start <- Sys.time()
  n <- nrow(x)
  d <- ncol(x)
  if (d < 2)
    stop("Data matrix has only one variable.")
  if (any(!is.finite(x)))
    stop("Missing or infinite values in the data are not allowed! ")
  # Explicit equality check: the previous `!all.equal(a, b, c)` was broken --
  # all.equal() returns a character vector on mismatch (so `!` errors), and its
  # third positional argument is silently taken as `tolerance`, not a value.
  if (!(nrow(h) == ncol(h) && ncol(h) == d))
    stop("The bandwidth matrix is of wrong dimension. ")
  y <- rep(0, n)
  for (i in seq_len(n)) {
    # These depend only on i: hoist out of the inner loop (was O(n^2) solves)
    h_inv <- solve(h[, , i])
    h_scale <- abs(det(h[, , i]))^(-1 / 2)
    s_i <- 0
    for (j in seq_len(n)) {
      u <- x[i, ] - x[j, ]
      s_i <- s_i + h_scale * exp(-0.5 * drop(t(u) %*% h_inv %*% u))
    }
    # Gaussian normalizing constant (2*pi)^(d/2); equals 2*pi for d = 2,
    # matching the original 2-d implementation
    y[i] <- s_i / (n * (2 * pi)^(d / 2))
  }
  print(Sys.time() - start)
  return(y)
}
# Test the variable KDE function with megaman output
If we use the embedding x and the Riemannian metric h from megaman, we could produce contour plots.
# megaman embedding coordinates as an N x 2 matrix
x <- cbind(emb_isomap$`0`, emb_isomap$`1`)
# Reorder H from (N, s, s) to (s, s, N) so that pyriem_isomap[, , i] == H_isomap[i, , ];
# aperm() does the whole reordering in one vectorized call (replaces the element-copy loop)
pyriem_isomap <- aperm(H_isomap, c(2, 3, 1))
f <- mkde(x = x, h = pyriem_isomap)## Time difference of 5.880567 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1393 0.4059 0.8692 0.8075 1.1473 1.5196
# Top 20 index with the lowest densities
head(order(f), n=20)## [1] 326 38 134 182 324 135 327 325 37 133 39 278 181 328 86 277 230 144 85
## [20] 183
Contour plot for megaman
# Assemble (x, y, density) tibble for plotting
embed_den <- as_tibble(cbind(x = x[,1], y = x[,2], z = f)) %>%
  drop_na()
# library(akima)
pixel <- 100   # interpolation grid resolution
lenbreak <- 5  # target number of contour breaks
# Bicubic-spline interpolation of the density onto a regular grid
akima.spl <- interp(embed_den$x, embed_den$y, embed_den$z, nx=pixel, ny=pixel, linear=FALSE)
p.zmin <- min(embed_den$z,na.rm=TRUE)
p.zmax <- max(embed_den$z,na.rm=TRUE)
breaks <- pretty(c(p.zmin,p.zmax), lenbreak)
# hdrcde: highest-density-region scatterplot and 50/95/99% HDR contours
hdrscatterplot(embed_den$x, embed_den$y, noutliers = 10)
hdrinfo <- hdr.2d(embed_den$x, embed_den$y, prob = c(50, 95, 99))
plot.hdr2d(hdrinfo)
# Contour
contour(akima.spl, main = "smooth interp(*, linear = FALSE)", col = viridis(length(breaks)-1), levels=breaks, add=TRUE)
points(embed_den, pch = 20)
filled.contour(akima.spl, color.palette = viridis,
plot.axes = { axis(1); axis(2);
title("smooth interp(*, linear = FALSE)");
points(embed_den, pch = 3, col= hcl(c=20, l = 10))})
# Contour plot after interpolation for R metricML()
Now apply the mkde() function to the output of metricML() written in R.
# Repeat the density estimation with the R (metricML) embedding and metric
x <- metric_isomap$embedding
riem_isomap <- metric_isomap$rmetric
f <- mkde(x = x, h = riem_isomap)## Time difference of 5.768383 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0009714 0.0019742 0.0023836 0.0026091 0.0032485 0.0053262
# Indices of the 20 lowest-density points (outlier candidates)
head(order(f), n=20)## [1] 298 297 145 300 296 250 249 98 198 152 55 56 251 287 299 146 199 200 193
## [20] 335
We can check the outliers based on density f. In R, the outliers are different from megaman output.